# coding=UTF-8
'''
Created on 21 Apr 2012

@author: R
'''

import re
from sys import argv
from base import processpage, search_term

def get_total_page_number(url,proxy):
    """Return the highest page number advertised on the listing page at *url*.

    The page text is obtained from ``processpage(url, proxy)`` and searched
    for a link target of the form ``0_<N>.html`` followed by the pagination
    label matched literally in the pattern below; ``N`` is returned as an int.

    Raises:
        ValueError: if the fetched text contains no such link.
    """
    pagetext = processpage(url, proxy)
    # Raw string so that ``\d`` reaches the regex engine unchanged instead of
    # being treated as an (invalid) string escape.  The trailing character is
    # presumably the start of the "last page" link label -- verify against
    # the live markup.
    match = re.search(r'0_(\d+).html"  >尾', pagetext)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("no last-page link found in page fetched from %s" % url)
    return int(match.group(1))


def get_list(url,proxy):
    """Extract the land entries on the page at *url* as comma-separated text.

    The page text is obtained from ``processpage(url, proxy)`` and split into
    fragments lying between ``list_sub`` and ``list_sub_hover``.  Fragments
    shorter than 50 characters are skipped.  For every remaining fragment one
    line is produced with these fields, in order: title, URL, planned usage,
    start time, land area, planned building area, deal price, deal date.
    Each field is whatever ``search_term`` returns for the corresponding
    regex match (or failed match).

    Returns:
        All lines concatenated into one string, each terminated by a newline;
        the empty string when no fragment qualifies.
    """
    pagetext = processpage(url, proxy)

    # Compile every pattern once instead of once per entry.
    entry_re = re.compile("list_sub(.*?)list_sub_hover")
    url_re = re.compile("href..(http.*?html)")
    title_re = re.compile("title..(.*?)\"")
    usage_re = re.compile("规划用途[^>]*>[^>]*>(.*?)<")
    stime_re = re.compile("起始时间[^>]*>[^>]*>(.*?)<")
    size_re = re.compile("土地面积[^>]*>[^>]*>(.*?)<")
    csize_re = re.compile("规划建筑面积[^>]*>[^>]*>(.*?)<")
    price_re = re.compile("成交价[^>]*>[^>]*>(.*?)<")
    date_re = re.compile("成交日期[^>]*>[^>]*>(.*?)<")

    rows = []
    for land in entry_re.findall(pagetext):
        if len(land) < 50:
            # Too short to be a real entry.
            continue
        fields = [
            search_term(title_re.search(land)),
            search_term(url_re.search(land)),
            search_term(usage_re.search(land)),
            # Only whole "&nbsp;" entities are removed; str.strip('&nbsp;')
            # would strip any of the characters & n b s p ; individually.
            _strip_nbsp(search_term(stime_re.search(land))),
            search_term(size_re.search(land)),
            search_term(csize_re.search(land)),
            search_term(price_re.search(land)),
            search_term(date_re.search(land)),
        ]
        rows.append(','.join(fields) + '\n')
    return ''.join(rows)


def _strip_nbsp(text):
    """Remove whole ``&nbsp;`` entities from both ends of *text*."""
    entity = '&nbsp;'
    while text.startswith(entity):
        text = text[len(entity):]
    while text.endswith(entity):
        text = text[:-len(entity)]
    return text

def run():
    """Fetch a range of listing pages and append their entries to a file.

    The output path is read from ``argv[1]`` and the first page number from
    ``argv[2]``.  Each page URL is built as ``url + <page number> + ".html"``,
    passed to ``get_list``, and the returned text is appended to the output
    file.
    """
    # URL prefix of the listing pages.  110100 is the location (postcode);
    # the following field selects 0 = all, 1 = residential,
    # 2 = commercial/office.
    url="http://www.landlist.cn/market/110100__1_______0_"
    proxy='0'
    startpage = int(argv[2])
    # Stop page, set by hand.  To detect the last page automatically use
    # get_total_page_number(url+'1.html', proxy) instead.
    # NOTE(review): range() excludes endpage, so with endpage = 2 only page 1
    # is fetched, and using the total page count here would skip the final
    # page -- confirm whether that is intended.
    endpage = 2

    # The with-block guarantees the file is flushed and closed, even when a
    # page fetch raises part-way through.
    with open(argv[1],"a") as fh_putlist:
        for i in range(startpage,endpage):
            lurl=url+str(i)+".html"
            fh_putlist.write(get_list(lurl,proxy))
        
if __name__ == '__main__':
    # Fill in defaults only for arguments the user did not supply; appending
    # them unconditionally puts the default path into argv[2] when exactly
    # one argument is given, and int(argv[2]) in run() then fails.
    if len(argv) < 2:
        # Default output file, relative to the current working directory
        # (the parent directory when run from the source directory).
        argv.append("..\\lists.txt")
    if len(argv) < 3:
        # Default start page: begin at page 1.
        argv.append("1")
    run()